3. Computing Course Probabilities


In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import time
from collections import Counter

do_prints = False
use_STI = True

Get the list of courses in the correct order


In [2]:
if use_STI :
    courses = pd.read_pickle("../data/cleaned_courses_STI.pickle")
else:
    courses = pd.read_pickle("../data/cleaned_courses.pickle")
courses = courses.index.tolist()

In [3]:
probs = pd.DataFrame(index = courses, columns = courses).fillna(0)
probs.head()


Out[3]:
BIOENG-404 BIOENG-430 BIOENG-433 BIOENG-437 BIOENG-442 BIOENG-444 BIOENG-445 BIOENG-447 BIOENG-448 BIOENG-449 ... MSE-468 MSE-471 MSE-472 MSE-474 MSE-477 MSE-479 MSE-480 MSE-484 MSE-485 MSE-486
BIOENG-404 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BIOENG-430 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BIOENG-433 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BIOENG-437 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BIOENG-442 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 196 columns


In [4]:
if use_STI:
    enrol = pd.read_pickle("../data/cleaned_enrol_STI.pickle")
else:
    enrol = pd.read_pickle("../data/cleaned_enrol.pickle")
enrol.head()


Out[4]:
PersonID SubjectID Year Semester SubjectName SectionName StudentSectionCode StudyPlanCode CourseCodes CourseSection CourseID
0 121a95937d2fd38b397050d2302bea86 d162d389a20a07e05d949d84ef18009a 2008-2009 Master semestre 1 Biomécanique du système musculosquelettique Physique PH MA1 ME-482 ME 482
1 554c480f3c0f208acd4e7d93be3121e1 e866f2f224b09fdb5cdd4e06e72afd79 2008-2009 Master semestre 1 Biomechanics of the cardiovascular system Physique PH MA1 ME-481 ME 481
2 71f79d27544c73098e1dc9b7f3df30f2 7816d5304afa93e3a34e413dd069194b 2008-2009 Master semestre 1 Identification et commande I Génie électrique et électronique EL MA3 ME-421 ME 421
3 cc8be618a108fcfd0315506d8ab0dba7 7816d5304afa93e3a34e413dd069194b 2008-2009 Master semestre 1 Identification et commande I Génie électrique et électronique EL MA3 ME-421 ME 421
4 c6f357505bb066b5a89d7e7b01dfac93 7816d5304afa93e3a34e413dd069194b 2008-2009 Master semestre 1 Identification et commande I Génie électrique et électronique EL MA3 ME-421 ME 421

New Proba


In [5]:
Students=enrol['PersonID'].unique()

In [6]:
students_courses_df=enrol[['PersonID','CourseCodes']].groupby('PersonID').apply(lambda x: x.to_dict('list')['CourseCodes'])
students_courses_dico=students_courses_df.to_dict()

In [7]:
import itertools

In [8]:
weights_wt_students=np.zeros((len(courses),len(courses))) # weight matrix 1

w1=1 # weight for each edge

for person in tqdm(Students): # for each student ...
    for course1, course2 in itertools.combinations(students_courses_dico[person], 2):
        if(course1==course2):
            # enlever les redoublants
            print("is there?")
            continue
        probs.loc[course1,course2]+=w1 # add a weight between the courses
        probs.loc[course2,course1]+=w1 # add a weight between the courses


100%|██████████| 5715/5715 [01:31<00:00, 62.57it/s]

In [9]:
for course in tqdm(courses):
    student_per_course=(enrol.loc[ enrol["CourseCodes"] == course , "PersonID"].count())
    probs[probs.index==course]=probs[probs.index==course]/student_per_course
    probs.loc[course,course]=1.


100%|██████████| 196/196 [00:01<00:00, 170.52it/s]

In [10]:
probs.head()


Out[10]:
BIOENG-404 BIOENG-430 BIOENG-433 BIOENG-437 BIOENG-442 BIOENG-444 BIOENG-445 BIOENG-447 BIOENG-448 BIOENG-449 ... MSE-468 MSE-471 MSE-472 MSE-474 MSE-477 MSE-479 MSE-480 MSE-484 MSE-485 MSE-486
BIOENG-404 1.000000 0.043478 0.000000 0.017391 0.634783 0.139130 0.156522 0.008696 0.565217 0.321739 ... 0.000000 0.052174 0.000000 0.000000 0.078261 0.000000 0.000000 0.000000 0.026087 0.000000
BIOENG-430 0.027027 1.000000 0.264865 0.751351 0.270270 0.054054 0.021622 0.037838 0.027027 0.070270 ... 0.000000 0.064865 0.000000 0.016216 0.027027 0.016216 0.000000 0.000000 0.005405 0.000000
BIOENG-433 0.000000 0.532609 1.000000 0.608696 0.315217 0.076087 0.086957 0.054348 0.010870 0.076087 ... 0.000000 0.010870 0.000000 0.000000 0.021739 0.043478 0.000000 0.000000 0.000000 0.021739
BIOENG-437 0.007220 0.501805 0.202166 1.000000 0.324910 0.144404 0.039711 0.104693 0.032491 0.115523 ... 0.000000 0.187726 0.000000 0.007220 0.039711 0.043321 0.000000 0.000000 0.003610 0.003610
BIOENG-442 0.164414 0.112613 0.065315 0.202703 1.000000 0.171171 0.085586 0.069820 0.227477 0.308559 ... 0.004505 0.189189 0.015766 0.085586 0.137387 0.020270 0.045045 0.054054 0.065315 0.056306

5 rows × 196 columns


In [12]:
probs.to_pickle(os.path.join("Graphs","probs.pkl"))

In [ ]: